This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ stringr 1.5.1
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
readr::read_csv("C:/Users/Amna Khalil/OneDrive - Higher Education Commission/Documents/Data analyst/Sample - EU Superstore - Orders (1).csv")
## Rows: 10000 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): Order ID, Order Date, Ship Date, Ship Mode, Customer ID, Customer ...
## dbl (6): Row ID, Sales, Quantity, Discount, Profit, UnitPrice
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 10,000 × 21
## `Row ID` `Order ID` `Order Date` `Ship Date` `Ship Mode` `Customer ID`
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 ES-2018-1311038 2/7/2018 2/11/2018 Standard Cla… AS-10045
## 2 2 ES-2018-1311038 2/7/2018 2/11/2018 Standard Cla… AS-10045
## 3 3 ES-2018-1311038 2/7/2018 2/11/2018 Standard Cla… AS-10045
## 4 4 ES-2018-1311038 2/7/2018 2/11/2018 Standard Cla… AS-10045
## 5 5 ES-2018-1311038 2/7/2018 2/11/2018 Standard Cla… AS-10045
## 6 6 ES-2018-1311038 2/7/2018 2/11/2018 Standard Cla… AS-10045
## 7 7 ES-2018-1311038 2/7/2018 2/11/2018 Standard Cla… AS-10045
## 8 8 ES-2016-5113958 8/2/2016 8/7/2016 Second Class EB-13840
## 9 9 ES-2016-5113958 8/2/2016 8/7/2016 Second Class EB-13840
## 10 10 ES-2016-5113958 8/2/2016 8/7/2016 Second Class EB-13840
## # ℹ 9,990 more rows
## # ℹ 15 more variables: `Customer Name` <chr>, Segment <chr>, City <chr>,
## # State <chr>, Country <chr>, Region <chr>, `Product ID` <chr>,
## # Category <chr>, `Sub-Category` <chr>, `Product Name` <chr>, Sales <dbl>,
## # Quantity <dbl>, Discount <dbl>, Profit <dbl>, UnitPrice <dbl>
Sample_EU_Superstore_Orders_1_ <- read_csv("C:/Users/Amna Khalil/OneDrive - Higher Education Commission/Documents/Data analyst/Sample - EU Superstore - Orders (1).csv")
## Rows: 10000 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): Order ID, Order Date, Ship Date, Ship Mode, Customer ID, Customer ...
## dbl (6): Row ID, Sales, Quantity, Discount, Profit, UnitPrice
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Sample_EU_Superstore_Orders_1_)
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with a column named Sales
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with a column named Sales
# Calculate total sales
total_sales <- sum(Sample_EU_Superstore_Orders_1_$Sales)
# Print the total sales
cat("Total Sales for All Orders: $", total_sales, "\n")
## Total Sales for All Orders: $ 2938089
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with a column named Customer ID
# Count the number of unique customers
unique_customers <- length(unique(Sample_EU_Superstore_Orders_1_$`Customer ID`))
# Print the number of unique customers
cat("Number of Unique Customers: ", unique_customers, "\n")
## Number of Unique Customers: 795
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with a column named Ship Mode
# Create a table of the count of orders for each ship mode
orders_by_ship_mode <- table(Sample_EU_Superstore_Orders_1_$`Ship Mode`)
# Display the count of orders for each ship mode on the screen
cat("Orders by Ship Mode:\n")
## Orders by Ship Mode:
print(orders_by_ship_mode)
##
## First Class Same Day Second Class Standard Class
## 1454 516 1976 6054
# Create a bar plot
barplot(orders_by_ship_mode, main = "Orders by Ship Mode", col = "skyblue", ylab = "Number of Orders", xlab = "Ship Mode")
# Proportion of orders in each Ship Mode category
ship_mode_proportion <- Sample_EU_Superstore_Orders_1_ %>%
group_by(`Ship Mode`) %>%
summarise(order_count = n()) %>%
mutate(proportion = order_count / sum(order_count))
# Bar plot for proportion of orders in each Ship Mode category
ggplot(ship_mode_proportion, aes(x = `Ship Mode`, y = proportion, fill = `Ship Mode`)) +
geom_bar(stat = "identity") +
labs(title = "Proportion of Orders in Each Ship Mode Category", x = "Ship Mode", y = "Proportion") +
theme_minimal()
# Box plot for Ship Mode and Profit
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = `Ship Mode`, y = Profit, fill = `Ship Mode`)) +
geom_boxplot() +
labs(title = "Correlation between Ship Mode and Profit", x = "Ship Mode", y = "Profit") +
theme_minimal()
#Assuming you have the 'sales_data' data frame from the previous example
# Assuming your dataset is named 'Sample_EU_Superstore_Orders_1_'
str(Sample_EU_Superstore_Orders_1_)
## spc_tbl_ [10,000 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Row ID : num [1:10000] 1 2 3 4 5 6 7 8 9 10 ...
## $ Order ID : chr [1:10000] "ES-2018-1311038" "ES-2018-1311038" "ES-2018-1311038" "ES-2018-1311038" ...
## $ Order Date : chr [1:10000] "2/7/2018" "2/7/2018" "2/7/2018" "2/7/2018" ...
## $ Ship Date : chr [1:10000] "2/11/2018" "2/11/2018" "2/11/2018" "2/11/2018" ...
## $ Ship Mode : chr [1:10000] "Standard Class" "Standard Class" "Standard Class" "Standard Class" ...
## $ Customer ID : chr [1:10000] "AS-10045" "AS-10045" "AS-10045" "AS-10045" ...
## $ Customer Name: chr [1:10000] "Aaron Smayling" "Aaron Smayling" "Aaron Smayling" "Aaron Smayling" ...
## $ Segment : chr [1:10000] "Corporate" "Corporate" "Corporate" "Corporate" ...
## $ City : chr [1:10000] "Leeds" "Leeds" "Leeds" "Leeds" ...
## $ State : chr [1:10000] "England" "England" "England" "England" ...
## $ Country : chr [1:10000] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ Region : chr [1:10000] "North" "North" "North" "North" ...
## $ Product ID : chr [1:10000] "OFF-ST-10000988" "TEC-AC-10004144" "OFF-LA-10001915" "OFF-ST-10004550" ...
## $ Category : chr [1:10000] "Office Supplies" "Technology" "Office Supplies" "Office Supplies" ...
## $ Sub-Category : chr [1:10000] "Storage" "Accessories" "Labels" "Storage" ...
## $ Product Name : chr [1:10000] "Fellowes Folders, Blue" "SanDisk Numeric Keypad, Bluetooth" "Avery Legal Exhibit Labels, 5000 Label Set" "Fellowes Folders, Wire Frame" ...
## $ Sales : num [1:10000] 79.2 388.9 35.2 50.9 307.4 ...
## $ Quantity : num [1:10000] 3 7 3 2 3 2 3 3 7 3 ...
## $ Discount : num [1:10000] 0 0 0 0 0 0 0 0 0 0 ...
## $ Profit : num [1:10000] 39.6 0 16.1 13.2 73.7 ...
## $ UnitPrice : num [1:10000] 26.4 55.6 11.7 25.5 102.5 ...
## - attr(*, "spec")=
## .. cols(
## .. `Row ID` = col_double(),
## .. `Order ID` = col_character(),
## .. `Order Date` = col_character(),
## .. `Ship Date` = col_character(),
## .. `Ship Mode` = col_character(),
## .. `Customer ID` = col_character(),
## .. `Customer Name` = col_character(),
## .. Segment = col_character(),
## .. City = col_character(),
## .. State = col_character(),
## .. Country = col_character(),
## .. Region = col_character(),
## .. `Product ID` = col_character(),
## .. Category = col_character(),
## .. `Sub-Category` = col_character(),
## .. `Product Name` = col_character(),
## .. Sales = col_double(),
## .. Quantity = col_double(),
## .. Discount = col_double(),
## .. Profit = col_double(),
## .. UnitPrice = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
head(Sample_EU_Superstore_Orders_1_)
## # A tibble: 6 × 21
## `Row ID` `Order ID` `Order Date` `Ship Date` `Ship Mode` `Customer ID`
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 ES-2018-1311038 2/7/2018 2/11/2018 Standard Class AS-10045
## 2 2 ES-2018-1311038 2/7/2018 2/11/2018 Standard Class AS-10045
## 3 3 ES-2018-1311038 2/7/2018 2/11/2018 Standard Class AS-10045
## 4 4 ES-2018-1311038 2/7/2018 2/11/2018 Standard Class AS-10045
## 5 5 ES-2018-1311038 2/7/2018 2/11/2018 Standard Class AS-10045
## 6 6 ES-2018-1311038 2/7/2018 2/11/2018 Standard Class AS-10045
## # ℹ 15 more variables: `Customer Name` <chr>, Segment <chr>, City <chr>,
## # State <chr>, Country <chr>, Region <chr>, `Product ID` <chr>,
## # Category <chr>, `Sub-Category` <chr>, `Product Name` <chr>, Sales <dbl>,
## # Quantity <dbl>, Discount <dbl>, Profit <dbl>, UnitPrice <dbl>
colnames(Sample_EU_Superstore_Orders_1_)
## [1] "Row ID" "Order ID" "Order Date" "Ship Date"
## [5] "Ship Mode" "Customer ID" "Customer Name" "Segment"
## [9] "City" "State" "Country" "Region"
## [13] "Product ID" "Category" "Sub-Category" "Product Name"
## [17] "Sales" "Quantity" "Discount" "Profit"
## [21] "UnitPrice"
# Convert 'Order Date' to Date format
Sample_EU_Superstore_Orders_1_$Order_Date <- as.Date(Sample_EU_Superstore_Orders_1_$`Order Date`, format = "%m/%d/%Y")
# Assuming your dataset has columns 'Order_Date', 'Ship Mode', and 'Sales'
Sample_EU_Superstore_Orders_1_ <- Sample_EU_Superstore_Orders_1_[order(Sample_EU_Superstore_Orders_1_$Order_Date), ]
# Convert 'Order Date' to Date format
Sample_EU_Superstore_Orders_1_$Order_Date <- as.Date(Sample_EU_Superstore_Orders_1_$`Order Date`, format = "%m/%d/%Y")
# Assuming your dataset has columns 'Order_Date', 'Ship_Mode', and 'Sales'
Sample_EU_Superstore_Orders_1_ <- Sample_EU_Superstore_Orders_1_[order(Sample_EU_Superstore_Orders_1_$Order_Date), ]
# Calculate rolling 30-day sales by ship mode
Sample_EU_Superstore_Orders_1_$Rolling_Sales <- ave(
Sample_EU_Superstore_Orders_1_$Sales,
Sample_EU_Superstore_Orders_1_$`Ship Mode`,
FUN = function(x) zoo::rollapply(x, width = 30, sum, fill = NA, align = "right")
)
# Plot the rolling 30-day sales by ship mode, ignoring missing values
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = Order_Date, y = Rolling_Sales, color = `Ship Mode`)) +
geom_line(na.rm = TRUE) +
labs(
title = "Rolling 30-Day Sales by Ship Mode",
x = "Order Date",
y = "Rolling Sales (in dollars)"
) +
theme_minimal()
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with columns 'Order Date', 'Region', 'Country', and 'Sales'
# Convert 'Order Date' to Date format
Sample_EU_Superstore_Orders_1_$Order_Date <- as.Date(Sample_EU_Superstore_Orders_1_$`Order Date`, format = "%m/%d/%Y")
# Create a box plot for each region, with countries within each region
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = Region, y = Sales, fill = Country)) +
geom_boxplot() +
labs(title = "Order Value by Region and Country", x = "Region", y = "Order Value", fill = "Country") +
theme_minimal()
knitr::opts_chunk$set(echo = TRUE)
# Assuming Sample_EU_Superstore_Orders_1 is a data frame with columns 'Order Date', 'Region', and 'Sales'
# Convert 'Order Date' to Date format
Sample_EU_Superstore_Orders_1_$Order_Date <- as.Date(Sample_EU_Superstore_Orders_1_$`Order Date`, format = "%m/%d/%Y")
# Create side-by-side box plots for each region
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = Region, y = Sales, fill = Region)) +
geom_boxplot() +
labs(title = "Distribution of Order Value by Region", x = "Region", y = "Order Value") +
theme_minimal()
# Create a bar plot of sales by region
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = Region, y = Sales, fill = Region)) +
geom_bar(stat = "identity") +
labs(title = "Distribution of Sales Across Different Regions", x = "Region", y = "Total Sales") +
theme_minimal()
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with columns 'City' and 'Sales'
# Aggregate sales data by city
sales_by_city <- aggregate(Sales ~ City, data = Sample_EU_Superstore_Orders_1_, sum)
# Identify top cities (adjust 'n' based on your preference)
top_cities <- head(sales_by_city[order(sales_by_city$Sales, decreasing = TRUE), ], n = 10)
# Create a bar plot for sales by top cities
ggplot(top_cities, aes(x = reorder(City, -Sales), y = Sales, fill = City)) +
geom_bar(stat = "identity") +
labs(title = "Sales by Top Cities", x = "City", y = "Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
plotly::ggplotly(
Sample_EU_Superstore_Orders_1_ %>%
group_by(Country) %>%
summarise(order_total = sum(Sales)) %>%
ggplot(aes(x = Country, y = order_total, fill = Country)) +
geom_col() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
plotly::ggplotly(
df_summary <- Sample_EU_Superstore_Orders_1_ %>%
group_by(City, Country) %>%
summarise(City_orders = n(), .groups = 'drop') %>%
ungroup() %>%
group_by(Country) %>%
mutate(
country_orders = sum(City_orders),
order_proportion = round(City_orders / country_orders, 4)
) %>%
ggplot(aes(x = Country, y = order_proportion, fill = City)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
plotly::ggplotly(
df_summary <- Sample_EU_Superstore_Orders_1_ %>%
group_by(City, Country) %>%
summarise(City_orders = n(), .groups = 'drop') %>%
ungroup() %>%
group_by(Country) %>%
mutate(
country_orders = sum(City_orders),
order_proportion = round(City_orders / country_orders, 4)
) %>%
select(City, Country, City_orders, order_proportion) %>%
ggplot(aes(x = Country, y = City_orders, fill = order_proportion)) +
geom_bar(stat = "identity", position = "stack") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
# Create a bar plot of sales by sub-category
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = `Sub-Category`, fill = `Sub-Category`)) +
geom_bar() +
labs(title = "Distribution of Sales by Sub-Category",
x = "Sub-Category",
y = "Total Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better readability
# Sort the dataset by Sales in descending order
sales_data_sorted <- Sample_EU_Superstore_Orders_1_[order(-Sample_EU_Superstore_Orders_1_$Sales), ]
# Select the top 5 countries
top_countries <- head(sales_data_sorted, 5)
# Create bar plots for top 5 countries
library(ggplot2)
ggplot(top_countries, aes(x = reorder(Country, -Sales), y = Sales, fill = Country)) +
geom_bar(stat = "identity") +
labs(title = "Top 5 Countries by Sales", x = "Country", y = "Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Assuming you have the 'Sample_EU_Superstore_Orders_1_' data frame
# Convert 'Order Date' to Date format
Sample_EU_Superstore_Orders_1_$Order_Date <- as.Date(Sample_EU_Superstore_Orders_1_$`Order Date`, format = "%m/%d/%Y")
# Create a new variable for the quarter
Sample_EU_Superstore_Orders_1_$Quarter <- quarters(Sample_EU_Superstore_Orders_1_$Order_Date)
# Create a stacked barplot
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = Quarter, y = Sales, fill = Category)) +
geom_bar(stat = "sum") +
labs(
title = "Quarterly Sales by Category",
x = "Quarter",
y = "Sales (in dollars)",
fill = "Category"
) +
theme_minimal()
# Find the product category with the highest sales
# Sum the sales for each product category
sales_by_category <- aggregate(Sales ~ Category, data = Sample_EU_Superstore_Orders_1_, sum)
# Find the product category with the highest sales
max_sales_category <- sales_by_category[which.max(sales_by_category$Sales), "Category"]
# Display the product category with the highest sales on the screen
cat("Product Category with the Highest Sales: ", max_sales_category, "\n")
## Product Category with the Highest Sales: Technology
# Create a bar plot
barplot(sales_by_category$Sales, names.arg = sales_by_category$Category, main = "Sales by Product Category", xlab = "Product Category", ylab = "Sales ($)", col = "purple", border = "black")
library(ggplot2)
# Assuming Sample_EU_Superstore_Orders_1_ is your data frame
Sample_EU_Superstore_Orders_1_$`Order Date` <- as.Date(Sample_EU_Superstore_Orders_1_$`Order Date`, format = "%m/%d/%Y")
# Arrange data by Date within each category
Sample_EU_Superstore_Orders_1_ <- Sample_EU_Superstore_Orders_1_ %>%
arrange(Category, `Order Date`)
# Calculate rolling past 30 days sales for each category
Sample_EU_Superstore_Orders_1_ <- Sample_EU_Superstore_Orders_1_ %>%
group_by(Category) %>%
mutate(rolling_sales_30d = zoo::rollapplyr(Sales, width = 30, FUN = sum, align = "right", fill = NA))
# Plot the result
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = `Order Date`, y = rolling_sales_30d, color = Category)) +
geom_line() +
labs(title = "Rolling 30 Days Sales Trend by Category", x = "Order Date", y = "Rolling Sales (30 Days)", color = "Category") +
theme_minimal()
## Warning: Removed 87 rows containing missing values (`geom_line()`).
# Extract quarter from the Order Date
Sample_EU_Superstore_Orders_1_$Quarter <- quarters(Sample_EU_Superstore_Orders_1_$Order_Date)
# Line plot for sales variation over different quarters
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = Quarter, y = Sales, group = 1)) +
geom_line() +
labs(title = "Sales Variation Over Different Quarters", x = "Quarter", y = "Sales") +
theme_minimal()
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with columns 'Order Date', 'State', and 'Sales'
# Convert 'Order Date' to Date format
Sample_EU_Superstore_Orders_1_$Order_Date <- as.Date(Sample_EU_Superstore_Orders_1_$`Order Date`, format = "%m/%d/%Y")
# Extract year and quarter from Order Date
Sample_EU_Superstore_Orders_1_$Year <- format(Sample_EU_Superstore_Orders_1_$Order_Date, "%Y")
Sample_EU_Superstore_Orders_1_$Quarter <- quarters(Sample_EU_Superstore_Orders_1_$Order_Date)
# Create a stacked barplot for each quarter
ggplot(Sample_EU_Superstore_Orders_1_, aes(x = Year, y = Sales, fill = Quarter)) +
geom_bar(stat = "identity") +
labs(title = "Stacked Barplot by Quarter", x = "Year", y = "Sales", fill = "Quarter") +
theme_minimal()
# Assuming Sample_EU_Superstore_Orders_1_ is a data frame with columns 'Customer ID', 'Product ID', and 'Sales'
# Identify top 5 customers and products
top_customers <- head(aggregate(Sales ~ `Customer ID`, data = Sample_EU_Superstore_Orders_1_, sum), 5)
top_products <- head(aggregate(Sales ~ `Product ID`, data = Sample_EU_Superstore_Orders_1_, sum), 5)
# Create bar plots for top 5 customers and products
ggplot(top_customers, aes(x = reorder(`Customer ID`, -Sales), y = Sales, fill = `Customer ID`)) +
geom_bar(stat = "identity") +
labs(title = "Top 5 Customers", x = "Customer ID", y = "Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Create bar plots for top 5 customers and products
ggplot(top_products, aes(x = reorder(`Product ID`, -Sales), y = Sales, fill = `Product ID`)) +
geom_bar(stat = "identity") +
labs(title = "Top 5 Products", x = "Product ID", y = "Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Calculate the correlation between discount and profit margin
correlation_discount_profit <- cor(Sample_EU_Superstore_Orders_1_$Discount, Sample_EU_Superstore_Orders_1_$Profit)
# Display the correlation on the screen
cat("Correlation between Discount and Profit Margin: ", correlation_discount_profit, "\n")
## Correlation between Discount and Profit Margin: -0.3391527